tempfile ma sin

/**********************************************
	NOAA weather
**********************************************/

* residual weather 1: non-parametric: average across years, then take moving average for the week around the day of the year
local weather temp_noaa_6a4p dew_noaa_6a4p pressure_6a4p wind_speed_noaa_6a4p prec_noaa_6a4p sky_noaa_6a4p 
use location date `weather' using data/weather/semirawdata/NOAA_6a4p.dta, clear
isid location date
gen int doy = doy(date)
collapse `weather', by(location doy)
expand 2 if inlist(doy,1,2,3,364,365,366), gen(extension)
replace doy = doy + 366 if inlist(doy,1,2,3) & extension
replace doy = doy - 366 if inlist(doy,364,365,366) & extension
encode location, gen(loc)
tsset loc doy
foreach var of varlist `weather' {
	tssmooth ma `var'_ma = `var', window(3 1 3)
	}
drop if !inrange(doy,1,366)
keep location doy *_ma
compress
save `ma'

* residual weather 2: sin fit for temperature (takes a long time to run)
use location date temp_noaa_6a4p using data/weather/semirawdata/NOAA_6a4p.dta, clear
gen int doy = doy(date)
gen int year = year(date)
gen residual_temp = .
encode location, gen(loc) label(location)
sum loc
forvalues loc=`r(min)'/`r(max)' {
	qui count if loc==`loc' & !mi(temp)
	di "location: `: label location `loc'', days in sample: `=r(N)'"
	if r(N)>5 { // some locations have only a handful of observations, which makes nl crash
		nl (temp_noaa_6a4p = {trend=0}*(year-2004.5) + {average=50} + (1/2)*{range=40}*sin(_pi/2 + (doy-{hottestday=190})*_pi/183) ) if loc==`loc'
		predict holdervar if e(sample), resid
		replace residual_temp = holdervar if e(sample)
		drop holdervar
	}
}
label var residual "temp minus prediction from location-specific sine fit"
bys loc: egen res_sd = sd(res)
gen stud_res = residual/res_sd
gen byte stud_res_band = floor(abs(stud_res))
label var res_sd "location-specific std. dev. of residual_temp"
label var stud_res "studentized residual: residual_temp/res_sd"
label var stud_res_band "floor(abs(stud_res))"
drop loc doy year
compress
save `sin'

* main weather
use data/weather/semirawdata/NOAA_6a4p.dta, clear
drop pm25_6a4p pm25_local_6a4p ozone_6a4p carbon_monoxide_6a4p // these actually come from AQS, were merged into NOAA data in the process of creating the 6am-4pm averages from hourly data
format date %td

* merge in residuals and AQS
gen int doy = doy(date)
merge m:1 location_code doy using `ma', assert(3) nogenerate
foreach var of varlist `weather' {
	gen residual_`var' = `var' - `var'_ma
	}
drop *_ma doy
merge 1:1 location_code date using `sin', assert(3) nogenerate
merge 1:1 location_code date using data/weather/semirawdata/AQS_daily_optimized_coverage.dta, keep(1 3) nogenerate keepusing(pm25 ozone carbon_monoxide)

* clean up & save
compress
save data/weather/NOAA_AQS, replace